
package com.jstarcraft.ai.jsat.clustering.hierarchical;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.PriorityQueue;

import com.jstarcraft.ai.jsat.DataSet;
import com.jstarcraft.ai.jsat.SimpleDataSet;
import com.jstarcraft.ai.jsat.classifiers.DataPoint;
import com.jstarcraft.ai.jsat.clustering.ClusterFailureException;
import com.jstarcraft.ai.jsat.clustering.KClusterer;
import com.jstarcraft.ai.jsat.clustering.KClustererBase;
import com.jstarcraft.ai.jsat.clustering.evaluation.ClusterEvaluation;

/**
 * DivisiveLocalClusterer is a hierarchical clustering method that works by
 * splitting the data set into sub trees from the top down. Unlike many top-up
 * methods, such as {@link SimpleHAC}, top-down methods require another
 * clustering method to perform the splitting at each iteration. If the base
 * method is not deterministic, then the top-down method will not be
 * deterministic. <br>
 * <br>
 * Specifically, DivisiveLocalClusterer greedily chooses the cluster to split
 * based on an evaluation of only the cluster being split. Because of this local
 * search of the world, DivisiveLocalClusterer has poor performance in
 * determining the number of clusters in the data set. As such, only the methods
 * where the exact number of clusters are recommended. <br>
 * This greedy strategy can also lead to drilling down clusters into small
 * parts, and works best when only a small number of clusters are needed.
 * 
 * @author Edward Raff
 */
public class DivisiveLocalClusterer extends KClustererBase {

    private static final long serialVersionUID = 8616401472810067778L;
    private KClusterer baseClusterer;
    private ClusterEvaluation clusterEvaluation;

    public DivisiveLocalClusterer(KClusterer baseClusterer, ClusterEvaluation clusterEvaluation) {
        this.baseClusterer = baseClusterer;
        this.clusterEvaluation = clusterEvaluation;
    }

    /**
     * Copy constructor
     * 
     * @param toCopy the object to copy
     */
    public DivisiveLocalClusterer(DivisiveLocalClusterer toCopy) {
        this(toCopy.baseClusterer.clone(), toCopy.clusterEvaluation.clone());
    }

    @Override
    public int[] cluster(DataSet dataSet, int[] designations) {
        return cluster(dataSet, 2, (int) Math.sqrt(dataSet.size()), designations);
    }

    @Override
    public int[] cluster(DataSet dataSet, boolean parallel, int[] designations) {
        return cluster(dataSet, 2, (int) Math.sqrt(dataSet.size()), parallel, designations);
    }

    @Override
    public int[] cluster(DataSet dataSet, int clusters, boolean parallel, int[] designations) {
        if (designations == null)
            designations = new int[dataSet.size()];
        /**
         * For each current cluster, we store the clustering results if we attempt to
         * split it into two. <br>
         * Each row needs to be re-set since the clustering methods will use the length
         * of the cluster size
         */
        final int[][] subDesignation = new int[clusters][];
        /**
         * Stores the index from the sub data set into the full data set
         */
        final int[][] originalPositions = new int[clusters][dataSet.size()];

        /**
         * Stores the dissimilarity of the splitting of the cluster with the same index
         * value. Negative value indicates not set
         */
        final double[] splitEvaluation = new double[clusters];

        PriorityQueue<Integer> clusterToSplit = new PriorityQueue<>(clusters, (Integer t, Integer t1) -> Double.compare(splitEvaluation[t], splitEvaluation[t1]));
        clusterToSplit.add(0);// We must start out spliting the one cluster of everyone!

        Arrays.fill(designations, 0);

        // Create initial split we will start from
        baseClusterer.cluster(dataSet, 2, parallel, designations);
        subDesignation[0] = Arrays.copyOf(designations, designations.length);
        for (int i = 0; i < originalPositions[0].length; i++)
            originalPositions[0][i] = i;

        List<DataPoint> dpSubC1 = new ArrayList<>();
        List<DataPoint> dpSubC2 = new ArrayList<>();

        /*
         * TODO it could be updated to use the split value to do a search range and stop
         * when a large jump occurs. This will perform poorl and underestimate the
         * number of clusters, becase it will decend one path as splitting clusters
         * improves untill it gets to the correct cluster size. Popping back up will
         * then casue an increase, which will cause the early termination.
         */
        for (int k = 1; k < clusters; k++) {
            int useSplit = clusterToSplit.poll();

            int newClusterID = k;

            dpSubC1.clear();
            dpSubC2.clear();

            // Split the data set into its two sub data sets
            for (int i = 0; i < subDesignation[useSplit].length; i++) {
                int origPos = originalPositions[useSplit][i];
                if (subDesignation[useSplit][i] == 0) {
                    dpSubC1.add(dataSet.getDataPoint(origPos));
                    continue;// We will asigng cluster '1' to be the new cluster number
                }
                dpSubC2.add(dataSet.getDataPoint(origPos));
                designations[origPos] = newClusterID;
            }

            computeSubClusterSplit(subDesignation, useSplit, dpSubC1, dataSet, designations, originalPositions, splitEvaluation, clusterToSplit, parallel);

            computeSubClusterSplit(subDesignation, newClusterID, dpSubC2, dataSet, designations, originalPositions, splitEvaluation, clusterToSplit, parallel);
        }

        return designations;
    }

    /**
     * Takes the data set and computes the clustering of a sub cluster, and stores
     * its information, and places the result in the queue
     * 
     * @param subDesignation            the array of arrays to store the designation
     *                                  array for the sub clustering
     * @param originalCluster           the originalCluster that we want to store
     *                                  the split information of
     * @param listOfDataPointsInCluster the list of all data points that belong to
     *                                  <tt>originalCluster</tt>
     * @param fullDataSet               the full original data set
     * @param fullDesignations          the designation array for the full original
     *                                  data set
     * @param originalPositions         the array of arrays to store the map from
     *                                  and index in
     *                                  <tt>listOfDataPointsInCluster</tt> to its
     *                                  index in the full data set.
     * @param splitEvaluation           the array to store the cluster evaluation of
     *                                  the data set
     * @param clusterToSplit            the priority queue that stores the cluster
     *                                  id and sorts based on how good the sub
     *                                  splits were.
     */
    private void computeSubClusterSplit(final int[][] subDesignation, int originalCluster, List<DataPoint> listOfDataPointsInCluster, DataSet fullDataSet, int[] fullDesignations, final int[][] originalPositions, final double[] splitEvaluation, PriorityQueue<Integer> clusterToSplit, boolean parallel) {
        subDesignation[originalCluster] = new int[listOfDataPointsInCluster.size()];
        int pos = 0;
        for (int i = 0; i < fullDataSet.size(); i++) {
            if (fullDesignations[i] != originalCluster)
                continue;
            originalPositions[originalCluster][pos++] = i;
        }
        // Cluster the sub cluster
        SimpleDataSet dpSubC1DataSet = new SimpleDataSet(listOfDataPointsInCluster);
        try {
            baseClusterer.cluster(dpSubC1DataSet, 2, parallel, subDesignation[originalCluster]);
            splitEvaluation[originalCluster] = clusterEvaluation.evaluate(subDesignation[originalCluster], dpSubC1DataSet);
            clusterToSplit.add(originalCluster);
        } catch (ClusterFailureException ex) {
            splitEvaluation[originalCluster] = Double.POSITIVE_INFINITY;
        }

    }

    @Override
    public int[] cluster(DataSet dataSet, int lowK, int highK, boolean parallel, int[] designations) {
        return cluster(dataSet, lowK, parallel, designations);
    }

    @Override
    public DivisiveLocalClusterer clone() {
        return new DivisiveLocalClusterer(this);
    }
}
